lagdiff <- function(n) { # 前日差を求める関数
n - dplyr::lag(n, default = 0L)
}
ma7 <- function(n) { # 移動平均(7日)を求める関数
zoo::rollmeanr(n, k = 7L, na.pad = TRUE)
}
ma28 <- function(n) { # 移動平均(28日)を求める関数
zoo::rollmeanr(n, k = 28L, na.pad = TRUE)
}
daily_aggregate <- function(df, date, key) { # 日時集計を行う関数
date <- dplyr::enquo(date)
key <- dplyr::enquo(key)
df %>%
dplyr::group_by(!!date, !!key) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::ungroup() %>%
tidyr::complete(
date = seq.Date(from = min(!!date), to = max(!!date), by = "day"),
!!key, fill = list(n = 0L)
) %>%
dplyr::group_by(!!key) %>%
tidyr::nest() %>%
dplyr::mutate(
diff = purrr::map(data, ~ lagdiff(.$n)), # 前日差
cum = purrr::map(data, ~ cumsum(.$n)), # 累計
ma7 = purrr::map(data, ~ ma7(.$n)), # 移動平均(7日)
ma28 = purrr::map(data, ~ ma28(.$n)) # 移動平均(28日)
) %>%
tidyr::unnest(cols = c(data, diff, cum, ma7, ma28)) %>%
return()
}
subtitle <- paste0("Generated @", lubridate::now())
caption <- "Data Source: covid19japan.com"
個票データの集計に限らず、データをインポートした際には目的に応じて各変量(変数)の変数型を変更します。特に水準ごとに層別処理を行いたい場合には因子型に変換しておくと便利です。また、結合したいデータと名称や体系を合わせておくこともポイントです。
prefs <- "https://gist.githubusercontent.com/k-metrics/9f3fc18e042850ff24ad9676ac34764b/raw/f4ea87f429e1ca28627feff94b67c8b2432aee59/pref_utf8.csv" %>%
readr::read_csv() %>%
dplyr::mutate(
# Googleの予測データと結合するためにコード体系を合わせる
japan_prefecture_code = paste0("JP-", `コード`)
) %>%
dplyr::select(
# Googleの予測データと結合するために名称を変更する
japan_prefecture_code, prefecture_name = pref,
# 日本語の変数名は扱いにくいので英語名に変更する
pref = `都道府県`, region = `八地方区分`, pops = `推計人口`
) %>%
dplyr::mutate(
# 水準ごとに表示させるために因子化する(あらかじめデータをコード順に並べて
# おくことが因子化の際のポイントのひとつ)
japan_prefecture_code = forcats::fct_inorder(japan_prefecture_code),
pref = forcats::fct_inorder(pref),
region = forcats::fct_inorder(region),
pops = as.integer(pops)
)
prefs
df <- "https://raw.githubusercontent.com/reustle/covid19japan-data/master/docs/patient_data/latest.json" %>%
jsonlite::fromJSON() %>%
dplyr::select(patientId, date = dateAnnounced, gender,
pref = detectedPrefecture, patientStatus, knownCluster,
confirmedPatient, ageBracket,
deceasedDate, deceasedReportedDate) %>%
dplyr::filter(confirmedPatient == TRUE) %>%
dplyr::mutate(
date = lubridate::as_date(date),
gender = forcats::as_factor(gender),
pref = stringr::str_to_lower(pref),
patientStatus = forcats::as_factor(patientStatus),
cluster = dplyr::if_else(!is.na(knownCluster), TRUE, FALSE),
ageBracket = forcats::as_factor(ageBracket),
deceasedDate = lubridate::as_date(deceasedDate),
deceasedReportedDate = lubridate::as_date(deceasedReportedDate)
) %>%
# 都道府県データと結合
dplyr::left_join(prefs, by = c("pref" = "prefecture_name")) %>%
dplyr::select(-pref) %>%
dplyr::rename(pref = pref.y) %>%
# 因子型の欠損値を水準化する
dplyr::mutate(
japan_prefecture_code = forcats::fct_explicit_na(japan_prefecture_code,
na_level = "JP-99"),
pref = forcats::fct_explicit_na(pref, na_level = "空港検疫"),
region = forcats::fct_explicit_na(region, na_level = "空港検疫"),
gender = forcats::fct_explicit_na(gender, na_level = "非公表"),
ageBracket = forcats::fct_explicit_na(ageBracket, na_level = "非公表"),
patientStatus = forcats::fct_explicit_na(patientStatus,
na_level = "Unknown")
) %>%
dplyr::filter(date < lubridate::today())
df
df %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 368645 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 3 |
| factor | 6 |
| logical | 2 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| patientId | 0 | 1.00 | 1 | 16 | 0 | 368645 | 0 |
| knownCluster | 366140 | 0.01 | 3 | 88 | 0 | 233 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2020-01-15 | 2021-01-25 | 2020-12-15 | 363 |
| deceasedDate | 368265 | 0 | 2020-02-13 | 2020-11-19 | 2020-05-08 | 151 |
| deceasedReportedDate | 368315 | 0 | 2020-02-13 | 2020-10-17 | 2020-05-16 | 131 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| gender | 0 | 1 | FALSE | 3 | 非公表: 258973, M: 61370, F: 48302 |
| patientStatus | 0 | 1 | FALSE | 9 | Unk: 366111, Hos: 1261, Dec: 372, Hom: 315 |
| ageBracket | 0 | 1 | FALSE | 13 | 非公表: 259070, 20: 29433, 30: 19042, 40: 16089 |
| japan_prefecture_code | 0 | 1 | FALSE | 48 | JP-: 94576, JP-: 41728, JP-: 38429, JP-: 23628 |
| pref | 0 | 1 | FALSE | 48 | 東京都: 94576, 大阪府: 41728, 神奈川: 38429, 埼玉県: 23628 |
| region | 0 | 1 | FALSE | 9 | 関東地: 188800, 近畿地: 72944, 中部地: 37817, 九州地: 32196 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| confirmedPatient | 0 | 1 | 1.00 | TRU: 368645 |
| cluster | 0 | 1 | 0.01 | FAL: 366140, TRU: 2505 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| pops | 2142 | 0.99 | 7924.54 | 4235.09 | 560 | 5107 | 7537 | 13822 | 13822 | ▆▅▆▇▇ |
japan_daily <- df %>%
dplyr::group_by(date) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::ungroup() %>%
tidyr::complete(
date = seq.Date(from = min(date), to = max(date), by = "day"),
fill = list(n = 0L)
) %>%
dplyr::mutate(
diff = lagdiff(n), # 前日差
cum = cumsum(n), # 累計
ma7 = ma7(n), # 移動平均(7日)
ma28 = ma28(n) # 移動平均(28日)
)
japan_daily
region_daily <- df %>%
dplyr::group_by(date, region) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::ungroup() %>% # この処理がポイント
tidyr::complete(
date = seq.Date(from = min(date), to = max(date), by = "day"),
region, fill = list(n = 0L)
) %>%
print() %>% # 途中結果の表示
dplyr::group_by(region) %>%
tidyr::nest() %>%
print() %>% # 途中結果の表示
# 組み合わせグルーピングの場合は purrr パッケージで処理するのが速い
dplyr::mutate(
diff = purrr::map(data, ~ lagdiff(.$n)), # 前日差
cum = purrr::map(data, ~ cumsum(.$n)), # 累計
ma7 = purrr::map(data, ~ ma7(.$n)), # 移動平均(7日)
ma28 = purrr::map(data, ~ ma28(.$n)) # 移動平均(28日)
) %>%
tidyr::unnest(cols = c(data, diff, cum, ma7, ma28))
region_daily
region_daily <- df %>%
daily_aggregate(date, region) # 上記の集計処理を関数化したもの
region_daily
pref_daily <- df %>%
daily_aggregate(date, pref)
pref_daily
ageBracket_daily <- df %>%
daily_aggregate(date, ageBracket)
ageBracket_daily
cluster_daily <- df %>%
daily_aggregate(date, cluster)
cluster_daily
subset <- japan_daily
title <- "【全国】陽性者数(単日)"
xlab <- ""
ylab <- ""
sec_scale <- 50
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date)) +
ggplot2::geom_bar(ggplot2::aes(y = n), stat = "identity", width = 1.0,
fill = "dark gray", alpha = 1.0) +
ggplot2::geom_line(ggplot2::aes(y = ma7), linetype = "solid",
colour = "gray10", size = 0.5) +
ggplot2::geom_line(ggplot2::aes(y = cum / sec_scale),
colour = "dark green", size = 1.0) +
ggplot2::scale_y_continuous(
name = "陽性者数(灰)・移動平均(黒)",
sec.axis = ggplot2::sec_axis(~ . * sec_scale,
name = "累積陽性者数(濃緑)")) +
ggplot2::theme(axis.text.y.left = ggplot2::element_text(colour = "gray10"),
axis.line.y.left = ggplot2::element_line(colour = "gray10"),
axis.text.y.right = ggplot2::element_text(colour = "dark green"),
axis.line.y.right = ggplot2::element_line(colour = "dark green")) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- japan_daily %>% dplyr::mutate(y = diff)
title <- "【全国】前日差(単日)"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y)) +
ggplot2::geom_line(colour = "dark green", alpha = 0.5) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = n)
title <- "【地方別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y)) +
ggplot2::geom_bar(ggplot2::aes(fill = key), stat = "identity",
width = 1.0, alpha = 0.5) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = n)
title <- "【地方別】単日"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 0.5) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = cum)
title <- "【地方別】累計"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 1) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = ma7)
title <- "【地方別】移動平均(7日)"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 1) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = ma28)
title <- "【地方別】移動平均(28日)"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 1) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = diff)
title <- "【地方別】前日差(単日)"
xlab <- ""
ylab <- "陽性者数"
ncol <- 3
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(alpha = 0.75) +
ggplot2::theme(legend.position = 'none') +
ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y") +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- pref_daily %>% dplyr::mutate(key = pref, y = n)
title <- "【都道府県別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 0.5) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- pref_daily %>% dplyr::mutate(key = pref)
title <- "【都道府県別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"
sec_scale <- 50
ncol <- 3
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date)) +
ggplot2::geom_bar(ggplot2::aes(y = n, fill = key), stat = "identity",
alpha = 0.25, width = 1.0) +
ggplot2::geom_line(ggplot2::aes(y = ma7, colour = key),
linetype = "solid", size = 0.25) +
ggplot2::geom_line(ggplot2::aes(y = cum / sec_scale, colour = key)) +
ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y") +
ggplot2::theme(legend.position = 'none') +
ggplot2::scale_y_continuous(
name = "陽性者数(棒)・移動平均(細線)",
sec.axis = ggplot2::sec_axis(~ . * sec_scale,
name = "陽性者数累計(太線)")) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- pref_daily %>% dplyr::mutate(key = pref, y = diff)
title <- "【都道府県別】前日差(単日)"
xlab <- ""
ylab <- "陽性者数"
ncol <- 3
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(alpha = 0.75) +
ggplot2::theme(legend.position = 'none') +
ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y") +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- ageBracket_daily %>% dplyr::mutate(key = ageBracket, y = n)
title <- "【年代別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y)) +
ggplot2::geom_bar(ggplot2::aes(fill = key), stat = "identity",
width = 1.0, alpha = 0.5) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- cluster_daily %>% dplyr::mutate(key = cluster, y = n)
title <- "【クラスター別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y)) +
ggplot2::geom_bar(ggplot2::aes(fill = key), stat = "identity",
width = 1.0, alpha = 0.5) +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = cum)
title <- "【地方別】累計"
xlab <- ""
ylab <- "陽性者数(常用対数)"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 1) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::scale_y_log10() +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = ma7)
title <- "【地方別】移動平均(7日)"
xlab <- ""
ylab <- "陽性者数(常用対数)"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 1) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::scale_y_log10() +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
subset <- region_daily %>% dplyr::mutate(key = region, y = ma28)
title <- "【地方別】移動平均(28日)"
xlab <- ""
ylab <- "陽性者数(常用対数)"
subset %>%
ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) +
ggplot2::geom_line(size = 1) +
ggplot2::theme(legend.position = 'none') +
ggrepel::geom_text_repel(ggplot2::aes(label = key),
data = subset(subset, date == max(date)),
nudge_x = 30, segment.alpha = 0.5, size = 4) +
ggplot2::lims(x = c(min(subset$date),
max(subset$date) + 45)) +
ggplot2::scale_y_log10() +
ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
x = xlab, y = ylab)
CC 4.0 BY-NC-SA, Sampo Suzuki